bitkeeper revision 1.1159.219.1 (41e4eca6eauLGi5osqpDb_OmwLgatA)
author akw27@labyrinth.cl.cam.ac.uk <akw27@labyrinth.cl.cam.ac.uk>
Wed, 12 Jan 2005 09:23:50 +0000 (09:23 +0000)
committer akw27@labyrinth.cl.cam.ac.uk <akw27@labyrinth.cl.cam.ac.uk>
Wed, 12 Jan 2005 09:23:50 +0000 (09:23 +0000)
Some fixes and cleanups to the blktap code.

linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap.c
linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap.h
linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c
linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_datapath.c
linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_userdev.c

index 5e7d47c58f576b15701e9273f3d075d94ec29d8c..e4fbf390bc9525b192b9fa955ec3c5904a9f5b25 100644 (file)
@@ -41,9 +41,9 @@ int __init xlblk_init(void)
     DPRINTK("   tap - Frontend connection init:\n");
     
     active_reqs_init();
+    blkif_interface_init();
+    blkdev_schedule_init();
     
-    ptfe_blkif.status = DISCONNECTED;
-
     (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, 
                                     CALLBACK_IN_BLOCKING_CONTEXT);
 
index 7e5d73ddf7119e821b111fd65beb801dfb02f55f..2d67d592fc44223c6ee04a1873c519022d8027e6 100644 (file)
 #include <asm/pgalloc.h>
 #include <asm-xen/hypervisor.h>
 #include <asm-xen/xen-public/io/blkif.h>
+#include <asm-xen/xen-public/io/ring.h>
+
+/* Used to signal to the backend that this is a tap domain. */
+#define BLKTAP_COOKIE 0xbeadfeed
 
 /* -------[ debug / pretty printing ]--------------------------------- */
 
 #if 0
+#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
+                           __FILE__ , __LINE__ , ## _a )
+#else
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
+#if 1
 #define ASSERT(_p) \
     if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
     __LINE__, __FILE__); *(int*)0=0; }
-#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
-                           __FILE__ , __LINE__ , ## _a )
 #else
 #define ASSERT(_p) ((void)0)
-#define DPRINTK(_f, _a...) ((void)0)
 #endif
 
 #define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args)
 
-/* -------[ connection / request tracking ]--------------------------- */
+
+/* -------[ connection tracking ]------------------------------------- */
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
 #define VMALLOC_VMADDR(x) ((unsigned long)(x))
@@ -49,30 +58,40 @@ extern spinlock_t blkif_io_lock;
 
 typedef struct blkif_st {
     /* Unique identifier for this interface. */
-    domid_t          domid;
-    unsigned int     handle;
+    domid_t             domid;
+    unsigned int        handle;
     /* Physical parameters of the comms window. */
-    unsigned long    shmem_frame;
-    unsigned int     evtchn;
-    int              irq;
+    unsigned long       shmem_frame;
+    unsigned int        evtchn;
+    int                 irq;
     /* Comms information. */
-    blkif_ring_t    *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */
-    BLKIF_RING_IDX     blk_req_cons;  /* Request consumer. */
-    BLKIF_RING_IDX     blk_resp_prod; /* Private version of resp. producer. */
+    blkif_back_ring_t   blk_ring;
     
     enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
     /*
      * DISCONNECT response is deferred until pending requests are ack'ed.
      * We therefore need to store the id from the original request.
-     */    u8               disconnect_rspid;
-    struct blkif_st *hash_next;
-    struct list_head blkdev_list;
-    spinlock_t       blk_ring_lock;
-    atomic_t         refcnt;
-    
+     */    
+    u8                  disconnect_rspid;
+    struct blkif_st    *hash_next;
+    struct list_head    blkdev_list;
+    spinlock_t          blk_ring_lock;
+    atomic_t            refcnt;
     struct work_struct work;
 } blkif_t;
 
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle);
+void blkif_disconnect_complete(blkif_t *blkif);
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blkif_put(_b)                             \
+    do {                                          \
+        if ( atomic_dec_and_test(&(_b)->refcnt) ) \
+            blkif_disconnect_complete(_b);        \
+    } while (0)
+
+
+/* -------[ active request tracking ]--------------------------------- */
+
 typedef struct {
     blkif_t       *blkif;
     unsigned long  id;
@@ -80,48 +99,16 @@ typedef struct {
     unsigned long  mach_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
     unsigned long  virt_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
     int            next_free;
+    int inuse; /* debugging */
 } active_req_t;
 
+typedef unsigned int ACTIVE_RING_IDX;
 
-/* -------[ block ring structs ]-------------------------------------- */
-
-/* Types of ring. */
-#define BLKIF_REQ_RING_TYPE 1
-#define BLKIF_RSP_RING_TYPE 2
-
-/* generic ring struct. */
-typedef struct blkif_generic_ring_struct {
-    int type;
-} blkif_generic_ring_t;
-
-/* A requestor's view of a ring. */
-typedef struct blkif_req_ring_struct {
-
-    int type;                    /* Will be BLKIF_REQ_RING_TYPE        */
-    BLKIF_RING_IDX req_prod;     /* PRIVATE req_prod index             */
-    BLKIF_RING_IDX rsp_cons;     /* Response consumer index            */
-    blkif_ring_t *ring;          /* Pointer to shared ring struct      */
-
-} blkif_req_ring_t;
-
-#define BLKIF_REQ_RING_INIT { BLKIF_REQ_RING_TYPE, 0, 0, 0 }
-
-/* A responder's view of a ring. */
-typedef struct blkif_rsp_ring_struct {
-
-    int type;       
-    BLKIF_RING_IDX rsp_prod;     /* PRIVATE rsp_prod index             */
-    BLKIF_RING_IDX req_cons;     /* Request consumer index             */
-    blkif_ring_t *ring;          /* Pointer to shared ring struct      */
-
-} blkif_rsp_ring_t;
-
-#define BLKIF_RSP_RING_INIT = { BLKIF_RSP_RING_TYPE, 0, 0, 0 }
-
-#define RING(a) (blkif_generic_ring_t *)(a)
-
-inline int BLKTAP_RING_FULL(blkif_generic_ring_t *ring);
+active_req_t *lookup_active_req(ACTIVE_RING_IDX idx);
+inline unsigned int ID_TO_IDX(unsigned long id);
+inline domid_t ID_TO_DOM(unsigned long id);
 
+inline void active_reqs_init(void);
 
 /* -------[ interposition -> character device interface ]------------- */
 
@@ -135,6 +122,7 @@ inline int BLKTAP_RING_FULL(blkif_generic_ring_t *ring);
 #define BLKTAP_IOCTL_KICK_FE         1
 #define BLKTAP_IOCTL_KICK_BE         2
 #define BLKTAP_IOCTL_SETMODE         3
+#define BLKTAP_IOCTL_PRINT_IDXS      100  
 
 /* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
 #define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
@@ -196,22 +184,12 @@ extern unsigned long mmap_vstart;
 #define RING_PAGES 128 
 extern unsigned long rings_vstart;
 
-/* -------[ Here be globals ]----------------------------------------- */
 
+/* -------[ Here be globals ]----------------------------------------- */
 extern unsigned long blktap_mode;
 
-
-/* blkif struct, containing ring to FE domain */
-extern blkif_t ptfe_blkif; 
-
 /* Connection to a single backend domain. */
-extern blkif_ring_t *blk_ptbe_ring;   /* Ring from the PT to the BE dom    */ 
-extern BLKIF_RING_IDX ptbe_resp_cons; /* Response consumer for comms ring. */
-extern BLKIF_RING_IDX ptbe_req_prod;  /* Private request producer.         */
-
-/* Rings up to user space. */ 
-extern blkif_req_ring_t fe_ring;// = BLKIF_REQ_RING_INIT;
-extern blkif_rsp_ring_t be_ring;// = BLKIF_RSP_RING_INIT;
+extern blkif_front_ring_t blktap_be_ring;
 
 /* Event channel to backend domain. */
 extern unsigned int blkif_ptbe_evtchn;
@@ -224,10 +202,13 @@ extern unsigned long blktap_ring_ok;
 /* init function for character device interface.                       */
 int blktap_init(void);
 
+/* init function for the blkif cache. */
+void __init blkif_interface_init(void);
+void __init blkdev_schedule_init(void);
+void blkif_deschedule(blkif_t *blkif);
+
 /* interfaces to the char driver, passing messages to and from apps.   */
 void blktap_kick_user(void);
-int blktap_write_to_ring(blkif_request_t *req);
-
 
 /* user ring access functions: */
 int blktap_write_fe_ring(blkif_request_t *req);
@@ -235,11 +216,12 @@ int blktap_write_be_ring(blkif_response_t *rsp);
 int blktap_read_fe_ring(void);
 int blktap_read_be_ring(void);
 
-/* and the helpers they call: */
-inline int write_resp_to_fe_ring(blkif_response_t *rsp);
-inline void kick_fe_domain(void);
+/* fe/be ring access functions: */
+int write_resp_to_fe_ring(blkif_t *blkif, blkif_response_t *rsp);
+int write_req_to_be_ring(blkif_request_t *req);
 
-inline int write_req_to_be_ring(blkif_request_t *req);
+/* event notification functions */
+inline void kick_fe_domain(blkif_t *blkif);
 inline void kick_be_domain(void);
 
 /* Interrupt handlers. */
@@ -250,5 +232,8 @@ irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs);
 /* Control message receiver. */
 extern void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id);
 
+/* debug */
+void print_vm_ring_idxs(void);
+        
 #define __BLKINT_H__
 #endif
index a3d485a6f3e25ffe9859daac9ec7599ee9c64675..b3cd1118973ded5526f41cfc1295ebd88955bd0d 100644 (file)
@@ -32,10 +32,71 @@ unsigned int blkif_ptbe_evtchn;
 
 /*-----[ Control Messages to/from Frontend VMs ]--------------------------*/
 
+#define BLKIF_HASHSZ 1024
+#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
+
+static kmem_cache_t *blkif_cachep;
+static blkif_t      *blkif_hash[BLKIF_HASHSZ];
+
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
+{
+    blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
+    while ( (blkif != NULL) && 
+            ((blkif->domid != domid) || (blkif->handle != handle)) )
+        blkif = blkif->hash_next;
+    return blkif;
+}
+
+static void __blkif_disconnect_complete(void *arg)
+{
+    blkif_t              *blkif = (blkif_t *)arg;
+    ctrl_msg_t            cmsg;
+    blkif_be_disconnect_t disc;
+
+    /*
+     * These can't be done in blkif_disconnect() because at that point there
+     * may be outstanding requests at the disc whose asynchronous responses
+     * must still be notified to the remote driver.
+     */
+    unbind_evtchn_from_irq(blkif->evtchn);
+    vfree(blkif->blk_ring.sring);
+
+    /* Construct the deferred response message. */
+    cmsg.type         = CMSG_BLKIF_BE;
+    cmsg.subtype      = CMSG_BLKIF_BE_DISCONNECT;
+    cmsg.id           = blkif->disconnect_rspid;
+    cmsg.length       = sizeof(blkif_be_disconnect_t);
+    disc.domid        = blkif->domid;
+    disc.blkif_handle = blkif->handle;
+    disc.status       = BLKIF_BE_STATUS_OKAY;
+    memcpy(cmsg.msg, &disc, sizeof(disc));
+
+    /*
+     * Make sure message is constructed /before/ status change, because
+     * after the status change the 'blkif' structure could be deallocated at
+     * any time. Also make sure we send the response /after/ status change,
+     * as otherwise a subsequent CONNECT request could spuriously fail if
+     * another CPU doesn't see the status change yet.
+     */
+    mb();
+    if ( blkif->status != DISCONNECTING )
+        BUG();
+    blkif->status = DISCONNECTED;
+    mb();
+
+    /* Send the successful response. */
+    ctrl_if_send_response(&cmsg);
+}
+
+void blkif_disconnect_complete(blkif_t *blkif)
+{
+    INIT_WORK(&blkif->work, __blkif_disconnect_complete, (void *)blkif);
+    schedule_work(&blkif->work);
+}
 
 void blkif_ptfe_create(blkif_be_create_t *create)
 {
-    blkif_t      *blkif;
+    blkif_t      *blkif, **pblkif;
     domid_t       domid  = create->domid;
     unsigned int  handle = create->blkif_handle;
 
@@ -43,16 +104,38 @@ void blkif_ptfe_create(blkif_be_create_t *create)
     /* May want to store info on the connecting domain here. */
 
     DPRINTK("PT got BE_CREATE\n");
-    blkif = &ptfe_blkif; /* for convenience if the hash is readded later. */
+
+    if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL )
+    {
+        DPRINTK("Could not create blkif: out of memory\n");
+        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
 
     /* blkif struct init code from blkback.c */
     memset(blkif, 0, sizeof(*blkif));
     blkif->domid  = domid;
     blkif->handle = handle;
-    blkif->status = DISCONNECTED;    
+    blkif->status = DISCONNECTED;  
     spin_lock_init(&blkif->blk_ring_lock);
     atomic_set(&blkif->refcnt, 0);
 
+    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
+    while ( *pblkif != NULL )
+    {
+        if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
+        {
+            DPRINTK("Could not create blkif: already exists\n");
+            create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
+            kmem_cache_free(blkif_cachep, blkif);
+            return;
+        }
+        pblkif = &(*pblkif)->hash_next;
+    }
+
+    blkif->hash_next = *pblkif;
+    *pblkif = blkif;
+
     create->status = BLKIF_BE_STATUS_OKAY;
 }
 
@@ -61,24 +144,59 @@ void blkif_ptfe_destroy(blkif_be_destroy_t *destroy)
 {
     /* Clear anything that we initialized above. */
 
+    domid_t       domid  = destroy->domid;
+    unsigned int  handle = destroy->blkif_handle;
+    blkif_t     **pblkif, *blkif;
+
     DPRINTK("PT got BE_DESTROY\n");
+    
+    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
+    while ( (blkif = *pblkif) != NULL )
+    {
+        if ( (blkif->domid == domid) && (blkif->handle == handle) )
+        {
+            if ( blkif->status != DISCONNECTED )
+                goto still_connected;
+            goto destroy;
+        }
+        pblkif = &blkif->hash_next;
+    }
+
+    destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+    return;
+
+ still_connected:
+    destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
+    return;
+
+ destroy:
+    *pblkif = blkif->hash_next;
+    kmem_cache_free(blkif_cachep, blkif);
     destroy->status = BLKIF_BE_STATUS_OKAY;
 }
 
 void blkif_ptfe_connect(blkif_be_connect_t *connect)
 {
-    domid_t       domid  = connect->domid;
-    /*unsigned int  handle = connect->blkif_handle;*/
-    unsigned int  evtchn = connect->evtchn;
-    unsigned long shmem_frame = connect->shmem_frame;
+    domid_t        domid  = connect->domid;
+    unsigned int   handle = connect->blkif_handle;
+    unsigned int   evtchn = connect->evtchn;
+    unsigned long  shmem_frame = connect->shmem_frame;
     struct vm_struct *vma;
-    pgprot_t      prot;
-    int           error;
-    blkif_t      *blkif;
+    pgprot_t       prot;
+    int            error;
+    blkif_t       *blkif;
+    blkif_sring_t *sring;
 
     DPRINTK("PT got BE_CONNECT\n");
 
-    blkif = &ptfe_blkif; /* for convenience if the hash is readded later. */
+    blkif = blkif_find_by_handle(domid, handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("blkif_connect attempted for non-existent blkif (%u,%u)\n", 
+                connect->domid, connect->blkif_handle); 
+        connect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
 
     if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL )
     {
@@ -112,30 +230,51 @@ void blkif_ptfe_connect(blkif_be_connect_t *connect)
         return;
     }
 
+    sring = (blkif_sring_t *)vma->addr;
+    SHARED_RING_INIT(BLKIF_RING, sring);
+    BACK_RING_INIT(BLKIF_RING, &blkif->blk_ring, sring);
+    
     blkif->evtchn        = evtchn;
     blkif->irq           = bind_evtchn_to_irq(evtchn);
     blkif->shmem_frame   = shmem_frame;
-    blkif->blk_ring_base = (blkif_ring_t *)vma->addr;
     blkif->status        = CONNECTED;
-    /*blkif_get(blkif);*/
+    blkif_get(blkif);
 
     request_irq(blkif->irq, blkif_ptfe_int, 0, "blkif-pt-backend", blkif);
 
     connect->status = BLKIF_BE_STATUS_OKAY;
 }
 
-void blkif_ptfe_disconnect(blkif_be_disconnect_t *disconnect)
+int blkif_ptfe_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id)
 {
-    /*
-     * don't actually set the passthrough to disconnected.
-     * We just act as a pipe, and defer to the real ends to handle things like
-     * recovery.
-     */
+    domid_t       domid  = disconnect->domid;
+    unsigned int  handle = disconnect->blkif_handle;
+    blkif_t      *blkif;
 
     DPRINTK("PT got BE_DISCONNECT\n");
+    
+    blkif = blkif_find_by_handle(domid, handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("blkif_disconnect attempted for non-existent blkif"
+                " (%u,%u)\n", disconnect->domid, disconnect->blkif_handle); 
+        disconnect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return 1; /* Caller will send response error message. */
+    }
+
+    if ( blkif->status == CONNECTED )
+    {
+        blkif->status = DISCONNECTING;
+        blkif->disconnect_rspid = rsp_id;
+        wmb(); /* Let other CPUs see the status change. */
+        free_irq(blkif->irq, blkif);
+        blkif_deschedule(blkif);
+        blkif_put(blkif);
+        return 0; /* Caller should not send response message. */
+    }
 
     disconnect->status = BLKIF_BE_STATUS_OKAY;
-    return;
+    return 1;
 }
 
 /*-----[ Control Messages to/from Backend VM ]----------------------------*/
@@ -150,7 +289,7 @@ static void blkif_ptbe_send_interface_connect(void)
     };
     blkif_fe_interface_connect_t *msg = (void*)cmsg.msg;
     msg->handle      = 0;
-    msg->shmem_frame = virt_to_machine(blk_ptbe_ring) >> PAGE_SHIFT;
+    msg->shmem_frame = virt_to_machine(blktap_be_ring.sring) >> PAGE_SHIFT;
     
     ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
 }
@@ -162,9 +301,11 @@ static void blkif_ptbe_close(void)
 /* Move from CLOSED to DISCONNECTED state. */
 static void blkif_ptbe_disconnect(void)
 {
-    blk_ptbe_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
-    blk_ptbe_ring->req_prod = blk_ptbe_ring->resp_prod 
-                            = ptbe_resp_cons = ptbe_req_prod = 0;
+    blkif_sring_t *sring;
+    
+    sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
+    SHARED_RING_INIT(BLKIF_RING, sring);
+    FRONT_RING_INIT(BLKIF_RING, &blktap_be_ring, sring);
     blkif_pt_state  = BLKIF_STATE_DISCONNECTED;
     DPRINTK("Blkif-Passthrough-BE is now DISCONNECTED.\n");
     blkif_ptbe_send_interface_connect();
@@ -319,7 +460,9 @@ void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
         case CMSG_BLKIF_BE_DISCONNECT:
             if ( msg->length != sizeof(blkif_be_disconnect_t) )
                 goto parse_error;
-            blkif_ptfe_disconnect((blkif_be_disconnect_t *)&msg->msg[0]);
+            if ( !blkif_ptfe_disconnect((blkif_be_disconnect_t *)&msg->msg[0],
+                    msg->id) )
+                return;
             break;        
 
         /* We just ignore anything to do with vbds for now. */
@@ -356,3 +499,12 @@ void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
     msg->length = 0;
     ctrl_if_send_response(msg);
 }
+
+/*-----[ All control messages enter here: ]-------------------------------*/
+
+void __init blkif_interface_init(void)
+{
+    blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 
+                                     0, 0, NULL, NULL);
+    memset(blkif_hash, 0, sizeof(blkif_hash));
+}
index c8733dc08860547ed827eff2921d8068cb48375d..367a83ceccebf8a258fbe1ec15ea597781e171fc 100644 (file)
@@ -5,55 +5,46 @@
  * Block request routing data path.
  * 
  * Copyright (c) 2004, Andrew Warfield
- *
+ * -- see full header in blktap.c
  */
  
 #include "blktap.h"
+#include <asm-xen/evtchn.h>
 
 /*-----[ The data paths ]-------------------------------------------------*/
-/* Connections to the frontend domains.*/
-blkif_t   ptfe_blkif; 
-/* Connection to a single backend domain. */
-blkif_ring_t *blk_ptbe_ring;   /* Ring from the PT to the BE dom    */ 
-BLKIF_RING_IDX ptbe_resp_cons; /* Response consumer for comms ring. */
-BLKIF_RING_IDX ptbe_req_prod;  /* Private request producer.         */
 
-/* Rings up to user space. */ 
-blkif_req_ring_t fe_ring;// = BLKIF_REQ_RING_INIT;
-blkif_rsp_ring_t be_ring;// = BLKIF_RSP_RING_INIT;
-
-/*-----[ Ring helpers ]---------------------------------------------------*/
-
-inline int BLKTAP_RING_FULL(blkif_generic_ring_t *ring)
-{
-    if (ring->type == BLKIF_REQ_RING_TYPE) {
-        blkif_req_ring_t *r = (blkif_req_ring_t *)ring;
-        return ( ( r->req_prod - r->rsp_cons ) == BLKIF_RING_SIZE );
-    }
-    
-    /* for now assume that there is always room in the response path. */
-    return 0;
-}
+/* Connection to a single backend domain. */
+blkif_front_ring_t blktap_be_ring;
 
 /*-----[ Tracking active requests ]---------------------------------------*/
 
 /* this must be the same as MAX_PENDING_REQS in blkback.c */
-#define MAX_ACTIVE_REQS 64
+#define MAX_ACTIVE_REQS ((ACTIVE_RING_IDX)64U)
 
-active_req_t  active_reqs[MAX_ACTIVE_REQS];
-unsigned char active_req_ring[MAX_ACTIVE_REQS];
-spinlock_t    active_req_lock = SPIN_LOCK_UNLOCKED;
-typedef unsigned int ACTIVE_RING_IDX;
-ACTIVE_RING_IDX active_prod, active_cons;
+active_req_t     active_reqs[MAX_ACTIVE_REQS];
+ACTIVE_RING_IDX  active_req_ring[MAX_ACTIVE_REQS];
+spinlock_t       active_req_lock = SPIN_LOCK_UNLOCKED;
+ACTIVE_RING_IDX  active_prod, active_cons;
 #define MASK_ACTIVE_IDX(_i) ((_i)&(MAX_ACTIVE_REQS-1))
 #define ACTIVE_IDX(_ar) (_ar - active_reqs)
+#define NR_ACTIVE_REQS (MAX_ACTIVE_REQS - active_prod + active_cons)
 
 inline active_req_t *get_active_req(void) 
 {
-    ASSERT(active_cons != active_prod);    
-    return &active_reqs[MASK_ACTIVE_IDX(active_cons++)];
+    ACTIVE_RING_IDX idx;
+    active_req_t *ar;
+    unsigned long flags;
+        
+    ASSERT(active_cons != active_prod);   
+    
+    spin_lock_irqsave(&active_req_lock, flags);
+    idx =  active_req_ring[MASK_ACTIVE_IDX(active_cons++)];
+    ar = &active_reqs[idx];
+if (ar->inuse) WPRINTK("AR INUSE! (%lu)\n", ar->id);
+ar->inuse = 1;
+    spin_unlock_irqrestore(&active_req_lock, flags);
+    
+    return ar;
 }
 
 inline void free_active_req(active_req_t *ar) 
@@ -61,10 +52,16 @@ inline void free_active_req(active_req_t *ar)
     unsigned long flags;
         
     spin_lock_irqsave(&active_req_lock, flags);
+ar->inuse = 0;
     active_req_ring[MASK_ACTIVE_IDX(active_prod++)] = ACTIVE_IDX(ar);
     spin_unlock_irqrestore(&active_req_lock, flags);
 }
 
+active_req_t *lookup_active_req(ACTIVE_RING_IDX idx)
+{
+    return &active_reqs[idx];   
+}
+
 inline void active_reqs_init(void)
 {
     ACTIVE_RING_IDX i;
@@ -76,55 +73,256 @@ inline void active_reqs_init(void)
         active_req_ring[i] = i;
 }
 
+/* Requests passing through the tap to the backend hijack the id field
+ * in the request message.  In it we put the AR index _AND_ the fe domid.
+ * the domid is used by the backend to map the pages properly.
+ */
+
+static inline unsigned long MAKE_ID(domid_t fe_dom, ACTIVE_RING_IDX idx)
+{
+    return ( (fe_dom << 16) | idx );
+}
+
+inline unsigned int ID_TO_IDX(unsigned long id) 
+{ 
+        return ( id & 0x0000ffff );
+}
+
+inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
+
+/*-----[ Ring helpers ]---------------------------------------------------*/
+
+inline int write_resp_to_fe_ring(blkif_t *blkif, blkif_response_t *rsp)
+{
+    blkif_response_t *resp_d;
+    active_req_t *ar;
+    
+    /* remap id, and free the active req. blkif lookup goes here too.*/
+    ar = &active_reqs[ID_TO_IDX(rsp->id)];
+    /* WPRINTK("%3u > %3lu\n", ID_TO_IDX(rsp->id), ar->id); */
+    rsp->id = ar->id;
+    free_active_req(ar);
+            
+    resp_d = RING_GET_RESPONSE(BLKIF_RING, &blkif->blk_ring,
+            blkif->blk_ring.rsp_prod_pvt);
+    memcpy(resp_d, rsp, sizeof(blkif_response_t));
+    wmb();
+    blkif->blk_ring.rsp_prod_pvt++;
+            
+    return 0;
+}
+
+inline int write_req_to_be_ring(blkif_request_t *req)
+{
+    blkif_request_t *req_d;
+
+    req_d = RING_GET_REQUEST(BLKIF_RING, &blktap_be_ring,
+            blktap_be_ring.req_prod_pvt);
+    memcpy(req_d, req, sizeof(blkif_request_t));
+    wmb();
+    blktap_be_ring.req_prod_pvt++;
+            
+    return 0;
+}
+
+inline void kick_fe_domain(blkif_t *blkif) 
+{
+    RING_PUSH_RESPONSES(BLKIF_RING, &blkif->blk_ring);
+    notify_via_evtchn(blkif->evtchn);
+    DPRINTK("notified FE(dom %u)\n", blkif->domid);
+    
+}
+
+inline void kick_be_domain(void)
+{
+    wmb(); /* Ensure that the frontend can see the requests. */
+    RING_PUSH_REQUESTS(BLKIF_RING, &blktap_be_ring);
+    notify_via_evtchn(blkif_ptbe_evtchn);
+    DPRINTK("notified BE\n");
+}
+
 /*-----[ Data to/from Frontend (client) VMs ]-----------------------------*/
 
+/*-----[ Scheduler list maint -from blkback ]--- */
+
+static struct list_head blkio_schedule_list;
+static spinlock_t blkio_schedule_list_lock;
+
+static int __on_blkdev_list(blkif_t *blkif)
+{
+    return blkif->blkdev_list.next != NULL;
+}
+
+static void remove_from_blkdev_list(blkif_t *blkif)
+{
+    unsigned long flags;
+    if ( !__on_blkdev_list(blkif) ) return;
+    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
+    if ( __on_blkdev_list(blkif) )
+    {
+        list_del(&blkif->blkdev_list);
+        blkif->blkdev_list.next = NULL;
+        blkif_put(blkif);
+    }
+    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
+}
+
+static void add_to_blkdev_list_tail(blkif_t *blkif)
+{
+    unsigned long flags;
+    if ( __on_blkdev_list(blkif) ) return;
+    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
+    if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
+    {
+        list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
+        blkif_get(blkif);
+    }
+    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
+}
+
+
+/*-----[ Scheduler functions - from blkback ]--- */
+
+static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
+
+static int do_block_io_op(blkif_t *blkif, int max_to_do);
+
+static int blkio_schedule(void *arg)
+{
+    DECLARE_WAITQUEUE(wq, current);
+
+    blkif_t          *blkif;
+    struct list_head *ent;
+
+    daemonize(
+        "xentapd"
+        );
+
+    for ( ; ; )
+    {
+        /* Wait for work to do. */
+        add_wait_queue(&blkio_schedule_wait, &wq);
+        set_current_state(TASK_INTERRUPTIBLE);
+        if ( (NR_ACTIVE_REQS == MAX_ACTIVE_REQS) || 
+             list_empty(&blkio_schedule_list) )
+            schedule();
+        __set_current_state(TASK_RUNNING);
+        remove_wait_queue(&blkio_schedule_wait, &wq);
+
+        /* Queue up a batch of requests. */
+        while ( (NR_ACTIVE_REQS < MAX_ACTIVE_REQS) &&
+                !list_empty(&blkio_schedule_list) )
+        {
+            ent = blkio_schedule_list.next;
+            blkif = list_entry(ent, blkif_t, blkdev_list);
+            blkif_get(blkif);
+            remove_from_blkdev_list(blkif);
+            if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
+                add_to_blkdev_list_tail(blkif);
+            blkif_put(blkif);
+        }
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+        /* Push the batch through to disc. */
+        run_task_queue(&tq_disk);
+#endif
+    }
+}
+
+static void maybe_trigger_blkio_schedule(void)
+{
+    /*
+     * Needed so that two processes, who together make the following predicate
+     * true, don't both read stale values and evaluate the predicate
+     * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
+     */
+    smp_mb();
+
+    if ( (NR_ACTIVE_REQS < (MAX_ACTIVE_REQS)) && /* XXX!!! was M_A_R/2*/
+         !list_empty(&blkio_schedule_list) ) 
+        wake_up(&blkio_schedule_wait);
+}
+
+void blkif_deschedule(blkif_t *blkif)
+{
+    remove_from_blkdev_list(blkif);
+}
+
+void __init blkdev_schedule_init(void)
+{
+    spin_lock_init(&blkio_schedule_list_lock);
+    INIT_LIST_HEAD(&blkio_schedule_list);
+
+    if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 )
+        BUG();
+}
+    
+/*-----[ Interrupt entry from a frontend ]------ */
+
 irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+    blkif_t *blkif = dev_id;
+
+    add_to_blkdev_list_tail(blkif);
+    maybe_trigger_blkio_schedule();
+    return IRQ_HANDLED;
+}
+
+/*-----[ Other Frontend Ring functions ]-------- */
+
+/* irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs)*/
+static int do_block_io_op(blkif_t *blkif, int max_to_do)
 {
     /* we have pending messages from the real frontend. */
 
-    blkif_request_t *req_s, *req_d;
-    BLKIF_RING_IDX fe_rp;
+    blkif_request_t *req_s;
+    RING_IDX i, rp;
     unsigned long flags;
-    int notify;
-    unsigned long i;
     active_req_t *ar;
+    int more_to_do = 0;
+    int notify_be = 0, notify_user = 0;
     
     DPRINTK("PT got FE interrupt.\n");
+
+    if (NR_ACTIVE_REQS == MAX_ACTIVE_REQS) return 1;
     
     /* lock both rings */
     spin_lock_irqsave(&blkif_io_lock, flags);
 
-    /* While there are REQUESTS on FERing: */
-    fe_rp = ptfe_blkif.blk_ring_base->req_prod;
+    rp = blkif->blk_ring.sring->req_prod;
     rmb();
-    notify = (ptfe_blkif.blk_req_cons != fe_rp);
-
-    for (i = ptfe_blkif.blk_req_cons; i != fe_rp; i++) {
-
-        /* Get the next request */
-        req_s = &ptfe_blkif.blk_ring_base->ring[MASK_BLKIF_IDX(i)].req;
+    
+    for ( i = blkif->blk_ring.req_cons; 
+         (i != rp) && 
+            !RING_REQUEST_CONS_OVERFLOW(BLKIF_RING, &blkif->blk_ring, i);
+          i++ )
+    {
+        
+        if ((--max_to_do == 0) || (NR_ACTIVE_REQS == MAX_ACTIVE_REQS)) 
+        {
+            more_to_do = 1;
+            break;
+        }
         
+        req_s = RING_GET_REQUEST(BLKIF_RING, &blkif->blk_ring, i);
         /* This is a new request:  
          * Assign an active request record, and remap the id. 
          */
         ar = get_active_req();
         ar->id = req_s->id;
-        req_s->id = ACTIVE_IDX(ar);
-        DPRINTK("%3lu < %3lu\n", req_s->id, ar->id);
+        ar->blkif = blkif;
+        req_s->id = MAKE_ID(blkif->domid, ACTIVE_IDX(ar));
+        /* WPRINTK("%3u < %3lu\n", ID_TO_IDX(req_s->id), ar->id); */
 
         /* FE -> BE interposition point is here. */
         
         /* ------------------------------------------------------------- */
         /* BLKIF_OP_PROBE_HACK:                                          */
-        /* Until we have grant tables, we need to allow the backent to   */
-        /* map pages that are either from this domain, or more commonly  */
-        /* from the real front end.  We achieve this in a terrible way,  */
-        /* by passing the front end's domid allong with PROBE messages   */
-        /* Once grant tables appear, this should all go away.            */
+        /* Signal to the backend that we are a tap domain.               */
 
         if (req_s->operation == BLKIF_OP_PROBE) {
-            DPRINTK("Adding FE domid to PROBE request.\n");
-            (domid_t)(req_s->frame_and_sects[1]) = ptfe_blkif.domid;
+            DPRINTK("Adding BLKTAP_COOKIE to PROBE request.\n");
+            req_s->frame_and_sects[1] = BLKTAP_COOKIE;
         }
 
         /* ------------------------------------------------------------- */
@@ -137,12 +335,9 @@ irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs)
             /* In MODE_INTERCEPT_FE, map attached pages into the app vma */
             /* In MODE_COPY_FE_PAGES, copy attached pages into the app vma */
 
-            /* XXX: mapping/copying of attached pages is still not done! */
-
             DPRINTK("req->UFERing\n"); 
             blktap_write_fe_ring(req_s);
-
-
+            notify_user = 1;
         }
 
         /* If we are not in MODE_INTERCEPT_FE or MODE_INTERCEPT_BE: */
@@ -153,61 +348,27 @@ irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs)
             /* copy the request message to the BERing */
 
             DPRINTK("blktap: FERing[%u] -> BERing[%u]\n", 
-                    (unsigned)MASK_BLKIF_IDX(i)
-                    (unsigned)MASK_BLKIF_IDX(ptbe_req_prod));
-
-            req_d = &blk_ptbe_ring->ring[MASK_BLKIF_IDX(ptbe_req_prod)].req;
+                    (unsigned)__SHARED_RING_MASK(BLKIF_RING, 
+                        blktap_be_ring.sring, i), 
+                    (unsigned)__SHARED_RING_MASK(BLKIF_RING, 
+                        blktap_be_ring.sring, blktap_be_ring.req_prod_pvt));
             
-            memcpy(req_d, req_s, sizeof(blkif_request_t));
-
-            ptbe_req_prod++;
-        }
-    }
-
-    ptfe_blkif.blk_req_cons = i;
-
-    /* If we have forwarded any responses, notify the appropriate ends. */
-    if (notify) {
-
-        /* we have sent stuff to the be, notify it. */
-        if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
-               (blktap_mode & BLKTAP_MODE_INTERCEPT_BE)) ) {
-            wmb();
-            blk_ptbe_ring->req_prod = ptbe_req_prod;
-
-            notify_via_evtchn(blkif_ptbe_evtchn);
-            DPRINTK(" -- and notified.\n");
-        }
-
-        /* we sent stuff to the app, notify it. */
-        if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
-             (blktap_mode & BLKTAP_MODE_COPY_FE) ) {
-
-            blktap_kick_user();
+            write_req_to_be_ring(req_s);
+            notify_be = 1;
         }
     }
 
+    blkif->blk_ring.req_cons = i;
+    
     /* unlock rings */
     spin_unlock_irqrestore(&blkif_io_lock, flags);
-
-    return IRQ_HANDLED;
-}
-
-inline int write_req_to_be_ring(blkif_request_t *req)
-{
-    blkif_request_t *req_d;
-
-    req_d = &blk_ptbe_ring->ring[MASK_BLKIF_IDX(ptbe_req_prod)].req;
-    memcpy(req_d, req, sizeof(blkif_request_t));
-    ptbe_req_prod++;
-
-    return 0;
-}
-
-inline void kick_be_domain(void) {
-    wmb();
-    blk_ptbe_ring->req_prod = ptbe_req_prod;
-    notify_via_evtchn(blkif_ptbe_evtchn);
+    
+    if (notify_user)
+        blktap_kick_user();
+    if (notify_be)
+        kick_be_domain();
+    
+    return more_to_do;
 }
 
 /*-----[ Data to/from Backend (server) VM ]------------------------------*/
@@ -216,31 +377,27 @@ inline void kick_be_domain(void) {
 irqreturn_t blkif_ptbe_int(int irq, void *dev_id, 
                                   struct pt_regs *ptregs)
 {
-    blkif_response_t  *resp_s, *resp_d;
-    BLKIF_RING_IDX be_rp;
+    blkif_response_t  *resp_s;
+    blkif_t *blkif;
+    RING_IDX rp, i;
     unsigned long flags;
-    int notify;
-    unsigned long i;
-    active_req_t *ar;
 
     DPRINTK("PT got BE interrupt.\n");
 
     /* lock both rings */
     spin_lock_irqsave(&blkif_io_lock, flags);
     
-    /* While there are RESPONSES on BERing: */
-    be_rp = blk_ptbe_ring->resp_prod;
+    rp = blktap_be_ring.sring->rsp_prod;
     rmb();
-    notify = (ptbe_resp_cons != be_rp);
-    
-    for ( i = ptbe_resp_cons; i != be_rp; i++ )
+      
+    for ( i = blktap_be_ring.rsp_cons; i != rp; i++)
     {
-        /* BE -> FE interposition point is here. */
+        resp_s = RING_GET_RESPONSE(BLKIF_RING, &blktap_be_ring, i);
         
-        /* Get the next response */
-        resp_s = &blk_ptbe_ring->ring[MASK_BLKIF_IDX(i)].resp;
+        /* BE -> FE interposition point is here. */
     
-       
+        blkif = active_reqs[ID_TO_IDX(resp_s->id)].blkif;
+        
         /* If we are in MODE_INTERCEPT_BE or MODE_COPY_BE: */
         if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
              (blktap_mode & BLKTAP_MODE_COPY_BE) ) {
@@ -249,10 +406,9 @@ irqreturn_t blkif_ptbe_int(int irq, void *dev_id,
             /* In MODE_INTERCEPT_BE, map attached pages into the app vma */
             /* In MODE_COPY_BE_PAGES, copy attached pages into the app vma */
 
-            /* XXX: copy/map the attached page! */
-
             DPRINTK("rsp->UBERing\n"); 
             blktap_write_be_ring(resp_s);
+            blktap_kick_user();
 
         }
        
@@ -264,254 +420,49 @@ irqreturn_t blkif_ptbe_int(int irq, void *dev_id,
             /* Copy the response message to FERing */
          
             DPRINTK("blktap: BERing[%u] -> FERing[%u]\n", 
-                    (unsigned) MASK_BLKIF_IDX(i), 
-                    (unsigned) MASK_BLKIF_IDX(ptfe_blkif.blk_resp_prod));
-
-            /* remap id, and free the active req. blkif lookup goes here too.*/
-            ar = &active_reqs[resp_s->id];
-            DPRINTK("%3lu > %3lu\n", resp_s->id, ar->id);
-            resp_s->id = ar->id;
-            free_active_req(ar);
-           
-            resp_d = &ptfe_blkif.blk_ring_base->ring[
-                MASK_BLKIF_IDX(ptfe_blkif.blk_resp_prod)].resp;
-
-            memcpy(resp_d, resp_s, sizeof(blkif_response_t));
-            
-            ptfe_blkif.blk_resp_prod++;
+                    (unsigned)__SHARED_RING_MASK(BLKIF_RING, 
+                        blkif->blk_ring.sring, i), 
+                    (unsigned)__SHARED_RING_MASK(BLKIF_RING, 
+                        blkif->blk_ring.sring, 
+                        blkif->blk_ring.rsp_prod_pvt));
 
-        }
-    }
-
-    ptbe_resp_cons = i;
-    
-    /* If we have forwarded any responses, notify the apropriate domains. */
-    if (notify) {
+            write_resp_to_fe_ring(blkif, resp_s);
+            kick_fe_domain(blkif);
 
-        /* we have sent stuff to the fe.  notify it. */
-        if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
-               (blktap_mode & BLKTAP_MODE_INTERCEPT_FE)) ) {
-            wmb();
-            ptfe_blkif.blk_ring_base->resp_prod = ptfe_blkif.blk_resp_prod;
-        
-            notify_via_evtchn(ptfe_blkif.evtchn);
-            DPRINTK(" -- and notified.\n");
-        }
-
-        /* we sent stuff to the app, notify it. */
-        if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
-             (blktap_mode & BLKTAP_MODE_COPY_BE) ) {
-
-            blktap_kick_user();
         }
     }
-
-    spin_unlock_irqrestore(&blkif_io_lock, flags);
-    return IRQ_HANDLED;
-}
-
-inline int write_resp_to_fe_ring(blkif_response_t *rsp)
-{
-    blkif_response_t *resp_d;
-    active_req_t *ar;
     
-    /* remap id, and free the active req. blkif lookup goes here too.*/
-    ar = &active_reqs[rsp->id];
-    DPRINTK("%3lu > %3lu\n", rsp->id, ar->id);
-    rsp->id = ar->id;
-    free_active_req(ar);
-            
-    resp_d = &ptfe_blkif.blk_ring_base->ring[
-        MASK_BLKIF_IDX(ptfe_blkif.blk_resp_prod)].resp;
-
-    memcpy(resp_d, rsp, sizeof(blkif_response_t));
-    ptfe_blkif.blk_resp_prod++;
-
-    return 0;
-}
-
-inline void kick_fe_domain(void) {
-    wmb();
-    ptfe_blkif.blk_ring_base->resp_prod = ptfe_blkif.blk_resp_prod;
-    notify_via_evtchn(ptfe_blkif.evtchn);
+    blktap_be_ring.rsp_cons = i;
     
-}
-
-static inline void flush_requests(void)
-{
-    wmb(); /* Ensure that the frontend can see the requests. */
-    blk_ptbe_ring->req_prod = ptbe_req_prod;
-    notify_via_evtchn(blkif_ptbe_evtchn);
-}
-
-/*-----[ Data to/from user space ]----------------------------------------*/
-
-
-int blktap_write_fe_ring(blkif_request_t *req)
-{
-    blkif_request_t *target;
-    int error, i;
-
-    /*
-     * This is called to pass a request from the real frontend domain's
-     * blkif ring to the character device.
-     */
-
-    if ( ! blktap_ring_ok ) {
-        DPRINTK("blktap: fe_ring not ready for a request!\n");
-        return 0;
-    }
-
-    if ( BLKTAP_RING_FULL(RING(&fe_ring)) ) {
-        DPRINTK("blktap: fe_ring is full, can't add.\n");
-        return 0;
-    }
-
-    target = &fe_ring.ring->ring[MASK_BLKIF_IDX(fe_ring.req_prod)].req;
-    memcpy(target, req, sizeof(*req));
-
-/* maybe move this stuff out into a seperate func ------------------- */
-
-    /*
-     * For now, map attached page into a fixed position into the vma.
-     * XXX: make this map to a free page.
-     */
-
-    /* Attempt to map the foreign pages directly in to the application */
-    for (i=0; i<target->nr_segments; i++) {
-
-        /* get an unused virtual address from the char device */
-        /* store the old page address */
-        /* replace the address with the virtual address */
-
-        /* blktap_vma->vm_start+((2+i)*PAGE_SIZE) */
-
-        error = direct_remap_area_pages(blktap_vma->vm_mm, 
-                                        MMAP_VADDR(req->id, i), 
-                                        target->frame_and_sects[0] & PAGE_MASK,
-                                        PAGE_SIZE,
-                                        blktap_vma->vm_page_prot,
-                                        ptfe_blkif.domid);
-        if ( error != 0 ) {
-            printk(KERN_INFO "remapping attached page failed! (%d)\n", error);
-            return 0;
-        }
-    }
-    /* fix the address of the attached page in the message. */
-    /* TODO:      preserve the segment number stuff here... */
-    /* target->frame_and_sects[0] = blktap_vma->vm_start + PAGE_SIZE;*/
-/* ------------------------------------------------------------------ */
 
+    spin_unlock_irqrestore(&blkif_io_lock, flags);
     
-    fe_ring.req_prod++;
-
-    return 0;
-}
-
-int blktap_write_be_ring(blkif_response_t *rsp)
-{
-    blkif_response_t *target;
-
-    /*
-     * This is called to pass a request from the real backend domain's
-     * blkif ring to the character device.
-     */
-
-    if ( ! blktap_ring_ok ) {
-        DPRINTK("blktap: be_ring not ready for a request!\n");
-        return 0;
-    }
-
-    if ( BLKTAP_RING_FULL(RING(&be_ring)) ) {
-        DPRINTK("blktap: be_ring is full, can't add.\n");
-        return 0;
-    }
-
-    target = &be_ring.ring->ring[MASK_BLKIF_IDX(be_ring.rsp_prod)].resp;
-    memcpy(target, rsp, sizeof(*rsp));
-
-
-    /* XXX: map attached pages and fix-up addresses in the copied address. */
-
-    be_ring.rsp_prod++;
-
-    return 0;
+    return IRQ_HANDLED;
 }
 
-int blktap_read_fe_ring(void)
-{
-    /* This is called to read responses from the UFE ring. */
-
-    BLKIF_RING_IDX fe_rp;
-    unsigned long i;
-    int notify;
-
-    DPRINTK("blktap_read_fe_ring()\n");
-
-    fe_rp = fe_ring.ring->resp_prod;
-    rmb();
-    notify = (fe_rp != fe_ring.rsp_cons);
-
-    /* if we are forwarding from UFERring to FERing */
-    if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) {
-
-        /* for each outstanding message on the UFEring  */
-        for ( i = fe_ring.rsp_cons; i != fe_rp; i++ ) {
+/* Debug : print the current ring indices. */
 
-            /* XXX: remap pages on that message as necessary */
-            /* copy the message to the UBEring */
-
-            DPRINTK("resp->fe_ring\n");
-            write_resp_to_fe_ring(&fe_ring.ring->ring[MASK_BLKIF_IDX(i)].resp);
-        }
-    
-        fe_ring.rsp_cons = fe_rp;
-
-        /* notify the fe if necessary */
-        if ( notify ) {
-            DPRINTK("kick_fe_domain()\n");
-            kick_fe_domain();
-        }
-    }
-
-    return 0;
-}
-
-int blktap_read_be_ring(void)
+void print_vm_ring_idxs(void)
 {
-    /* This is called to read responses from the UBE ring. */
-
-    BLKIF_RING_IDX be_rp;
-    unsigned long i;
-    int notify;
-
-    DPRINTK("blktap_read_be_ring()\n");
-
-    be_rp = be_ring.ring->req_prod;
-    rmb();
-    notify = (be_rp != be_ring.req_cons);
-
-    /* if we are forwarding from UFERring to FERing */
-    if (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) {
-
-        /* for each outstanding message on the UFEring  */
-        for ( i = be_ring.req_cons; i != be_rp; i++ ) {
-
-            /* XXX: remap pages on that message as necessary */
-            /* copy the message to the UBEring */
-
-            DPRINTK("req->be_ring\n");
-            write_req_to_be_ring(&be_ring.ring->ring[MASK_BLKIF_IDX(i)].req);
-        }
-    
-        be_ring.req_cons = be_rp;
-
-        /* notify the fe if necessary */
-        if ( notify ) {
-            DPRINTK("kick_be_domain()\n");
-            kick_be_domain();
-        }
+    int i;
+    blkif_t *blkif;
+            
+    WPRINTK("FE Rings: \n---------\n");
+    for ( i = 0; i < 50; i++) { 
+        blkif = blkif_find_by_handle((domid_t)i, 0);
+        if (blkif != NULL)
+            WPRINTK("%2d: req_cons: %2d, rsp_prod_prv: %2d "
+                "| req_prod: %2d, rsp_prod: %2d\n", i, 
+                blkif->blk_ring.req_cons,
+                blkif->blk_ring.rsp_prod_pvt,
+                blkif->blk_ring.sring->req_prod,
+                blkif->blk_ring.sring->rsp_prod);
     }
-
-    return 0;
-}
+    WPRINTK("BE Ring: \n--------\n");
+    WPRINTK("BE: rsp_cons: %2d, req_prod_prv: %2d "
+        "| req_prod: %2d, rsp_prod: %2d\n",
+        blktap_be_ring.rsp_cons,
+        blktap_be_ring.req_prod_pvt,
+        blktap_be_ring.sring->req_prod,
+        blktap_be_ring.sring->rsp_prod);
+}        
index c10e3f3a443eebb79a1156d324174319f65bd7a0..500270259cc2256e50a58d24ef044df22fbe90e2 100644 (file)
@@ -37,6 +37,10 @@ struct vm_area_struct *blktap_vma;
 unsigned long mmap_vstart;
 unsigned long rings_vstart;
 
+/* Rings up to user space. */
+static blkif_front_ring_t blktap_ufe_ring;
+static blkif_back_ring_t  blktap_ube_ring;
+
 /* -------[ blktap vm ops ]------------------------------------------- */
 
 static struct page *blktap_nopage(struct vm_area_struct *vma,
@@ -61,41 +65,39 @@ struct vm_operations_struct blktap_vm_ops = {
 
 static int blktap_open(struct inode *inode, struct file *filp)
 {
+    blkif_sring_t *sring;
+    
     if ( test_and_set_bit(0, &blktap_dev_inuse) )
         return -EBUSY;
 
     printk(KERN_ALERT "blktap open.\n");
 
     /* Allocate the fe ring. */
-    fe_ring.ring = (blkif_ring_t *)get_zeroed_page(GFP_KERNEL);
-    if (fe_ring.ring == NULL)
+    sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
+    if (sring == NULL)
         goto fail_nomem;
 
-    SetPageReserved(virt_to_page(fe_ring.ring));
+    SetPageReserved(virt_to_page(sring));
     
-    fe_ring.ring->req_prod = fe_ring.ring->resp_prod
-                           = fe_ring.req_prod
-                           = fe_ring.rsp_cons
-                           = 0;
+    SHARED_RING_INIT(BLKIF_RING, sring);
+    FRONT_RING_INIT(BLKIF_RING, &blktap_ufe_ring, sring);
 
     /* Allocate the be ring. */
-    be_ring.ring = (blkif_ring_t *)get_zeroed_page(GFP_KERNEL);
-    if (be_ring.ring == NULL)
+    sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
+    if (sring == NULL)
         goto fail_free_fe;
 
-    SetPageReserved(virt_to_page(be_ring.ring));
+    SetPageReserved(virt_to_page(sring));
     
-    be_ring.ring->req_prod = be_ring.ring->resp_prod
-                           = be_ring.rsp_prod
-                           = be_ring.req_cons
-                           = 0;
+    SHARED_RING_INIT(BLKIF_RING, sring);
+    BACK_RING_INIT(BLKIF_RING, &blktap_ube_ring, sring);
 
     DPRINTK(KERN_ALERT "blktap open.\n");
 
     return 0;
 
  fail_free_fe:
-    free_page( (unsigned long) fe_ring.ring);
+    free_page( (unsigned long) blktap_ufe_ring.sring);
 
  fail_nomem:
     return -ENOMEM;
@@ -109,11 +111,11 @@ static int blktap_release(struct inode *inode, struct file *filp)
     printk(KERN_ALERT "blktap closed.\n");
 
     /* Free the ring page. */
-    ClearPageReserved(virt_to_page(fe_ring.ring));
-    free_page((unsigned long) fe_ring.ring);
+    ClearPageReserved(virt_to_page(blktap_ufe_ring.sring));
+    free_page((unsigned long) blktap_ufe_ring.sring);
 
-    ClearPageReserved(virt_to_page(be_ring.ring));
-    free_page((unsigned long) be_ring.ring);
+    ClearPageReserved(virt_to_page(blktap_ube_ring.sring));
+    free_page((unsigned long) blktap_ube_ring.sring);
     
     return 0;
 }
@@ -146,16 +148,18 @@ static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
     /* not sure if I really need to do this... */
     vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
-    DPRINTK("Mapping be_ring page %lx.\n", __pa(be_ring.ring));
-    if (remap_page_range(vma, vma->vm_start, __pa(be_ring.ring), PAGE_SIZE, 
-                         vma->vm_page_prot)) {
-        printk(KERN_ERR "be_ring: remap_page_range failure!\n");
+    DPRINTK("Mapping be_ring page %lx.\n", __pa(blktap_ube_ring.sring));
+    if (remap_page_range(vma, vma->vm_start, 
+                         __pa(blktap_ube_ring.sring), 
+                         PAGE_SIZE, vma->vm_page_prot)) {
+        WPRINTK("be_ring: remap_page_range failure!\n");
     }
 
-    DPRINTK("Mapping fe_ring page %lx.\n", __pa(fe_ring.ring));
-    if (remap_page_range(vma, vma->vm_start + PAGE_SIZE, __pa(fe_ring.ring), 
+    DPRINTK("Mapping fe_ring page %lx.\n", __pa(blktap_ufe_ring.sring));
+    if (remap_page_range(vma, vma->vm_start + PAGE_SIZE, 
+                         __pa(blktap_ufe_ring.sring), 
                          PAGE_SIZE, vma->vm_page_prot)) {
-        printk(KERN_ERR "fe_ring: remap_page_range failure!\n");
+        WPRINTK("fe_ring: remap_page_range failure!\n");
     }
 
     blktap_vma = vma;
@@ -181,7 +185,24 @@ static int blktap_ioctl(struct inode *inode, struct file *filp,
             printk(KERN_INFO "blktap: set mode to %lx\n", arg);
             return 0;
         }
-        /* XXX: return a more meaningful error case here. */
+    case BLKTAP_IOCTL_PRINT_IDXS:
+        {
+            print_vm_ring_idxs();
+            WPRINTK("User Rings: \n-----------\n");
+            WPRINTK("UF: rsp_cons: %2d, req_prod_prv: %2d "
+                            "| req_prod: %2d, rsp_prod: %2d\n",
+                            blktap_ufe_ring.rsp_cons,
+                            blktap_ufe_ring.req_prod_pvt,
+                            blktap_ufe_ring.sring->req_prod,
+                            blktap_ufe_ring.sring->rsp_prod);
+            WPRINTK("UB: req_cons: %2d, rsp_prod_prv: %2d "
+                            "| req_prod: %2d, rsp_prod: %2d\n",
+                            blktap_ube_ring.req_cons,
+                            blktap_ube_ring.rsp_prod_pvt,
+                            blktap_ube_ring.sring->req_prod,
+                            blktap_ube_ring.sring->rsp_prod);
+            
+        }
     }
     return -ENOIOCTLCMD;
 }
@@ -190,11 +211,11 @@ static unsigned int blktap_poll(struct file *file, poll_table *wait)
 {
         poll_wait(file, &blktap_wait, wait);
 
-        if ( (fe_ring.req_prod != fe_ring.ring->req_prod) ||
-             (be_ring.rsp_prod != be_ring.ring->resp_prod) ) {
+        if ( RING_HAS_UNPUSHED_REQUESTS(BLKIF_RING, &blktap_ufe_ring) ||
+             RING_HAS_UNPUSHED_RESPONSES(BLKIF_RING, &blktap_ube_ring) ) {
 
-            fe_ring.ring->req_prod = fe_ring.req_prod;
-            be_ring.ring->resp_prod = be_ring.rsp_prod;
+            RING_PUSH_REQUESTS(BLKIF_RING, &blktap_ufe_ring);
+            RING_PUSH_RESPONSES(BLKIF_RING, &blktap_ube_ring);
             return POLLIN | POLLRDNORM;
         }
 
@@ -215,7 +236,149 @@ static struct file_operations blktap_fops = {
     release:  blktap_release,
     mmap:     blktap_mmap,
 };
+    
+/*-----[ Data to/from user space ]----------------------------------------*/
+
+
+int blktap_write_fe_ring(blkif_request_t *req)
+{
+    blkif_request_t *target;
+    int error, i;
 
+    /*
+     * This is called to pass a request from the real frontend domain's
+     * blkif ring to the character device.
+     */
+
+    if ( ! blktap_ring_ok ) {
+        DPRINTK("blktap: ufe_ring not ready for a request!\n");
+        return 0;
+    }
+
+    if ( RING_FULL(BLKIF_RING, &blktap_ufe_ring) ) {
+        DPRINTK("blktap: fe_ring is full, can't add.\n");
+        return 0;
+    }
+
+    //target = RING_NEXT_EMPTY_REQUEST(BLKIF_RING, &blktap_ufe_ring);
+    target = RING_GET_REQUEST(BLKIF_RING, &blktap_ufe_ring,
+            blktap_ufe_ring.req_prod_pvt);
+    memcpy(target, req, sizeof(*req));
+
+    /* Attempt to map the foreign pages directly in to the application */
+    for (i=0; i<target->nr_segments; i++) {
+
+        error = direct_remap_area_pages(blktap_vma->vm_mm, 
+                                        MMAP_VADDR(ID_TO_IDX(req->id), i), 
+                                        target->frame_and_sects[0] & PAGE_MASK,
+                                        PAGE_SIZE,
+                                        blktap_vma->vm_page_prot,
+                                        ID_TO_DOM(req->id));
+        if ( error != 0 ) {
+            printk(KERN_INFO "remapping attached page failed! (%d)\n", error);
+            /* the request is now dropped on the floor. */
+            return 0;
+        }
+    }
+    
+    blktap_ufe_ring.req_prod_pvt++;
+    
+    return 0;
+}
+
+int blktap_write_be_ring(blkif_response_t *rsp)
+{
+    blkif_response_t *target;
+
+    /*
+     * This is called to pass a request from the real backend domain's
+     * blkif ring to the character device.
+     */
+
+    if ( ! blktap_ring_ok ) {
+        DPRINTK("blktap: be_ring not ready for a request!\n");
+        return 0;
+    }
+
+    /* No test for fullness in the response direction. */
+
+    //target = RING_NEXT_EMPTY_RESPONSE(BLKIF_RING, &blktap_ube_ring);
+    target = RING_GET_RESPONSE(BLKIF_RING, &blktap_ube_ring,
+            blktap_ube_ring.rsp_prod_pvt);
+    memcpy(target, rsp, sizeof(*rsp));
+
+    /* no mapping -- pages were mapped in blktap_write_fe_ring() */
+
+    blktap_ube_ring.rsp_prod_pvt++;
+    
+    return 0;
+}
+
+int blktap_read_fe_ring(void)
+{
+    /* This is called to read responses from the UFE ring. */
+
+    RING_IDX i, rp;
+    blkif_response_t *resp_s;
+    blkif_t *blkif;
+    active_req_t *ar;
+
+    DPRINTK("blktap_read_fe_ring()\n");
+
+    /* if we are forwarding from UFERing to FERing */
+    if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) {
+
+        /* for each outstanding message on the UFEring  */
+        //RING_FOREACH_RESPONSE(BLKIF_RING, &blktap_ufe_ring, prod, resp_s) {
+        rp = blktap_ufe_ring.sring->rsp_prod;
+        rmb();
+        
+        for ( i = blktap_ufe_ring.rsp_cons; i != rp; i++ )
+        {
+            resp_s = RING_GET_RESPONSE(BLKIF_RING, &blktap_ufe_ring, i);
+            
+            DPRINTK("resp->fe_ring\n");
+            ar = lookup_active_req(ID_TO_IDX(resp_s->id));
+            blkif = ar->blkif;
+            write_resp_to_fe_ring(blkif, resp_s);
+            kick_fe_domain(blkif);
+        }
+        
+        blktap_ufe_ring.rsp_cons = i;
+    }
+    return 0;
+}
+
+int blktap_read_be_ring(void)
+{
+    /* This is called to read requests from the UBE ring. */
+
+    RING_IDX i, rp;
+    blkif_request_t *req_s;
+
+    DPRINTK("blktap_read_be_ring()\n");
+
+    /* if we are forwarding from UBERing to BERing */
+    if (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) {
+
+        /* for each outstanding message on the UBEring  */
+        //RING_FOREACH_REQUEST(BLKIF_RING, &blktap_ube_ring, prod, req_s) {
+        rp = blktap_ube_ring.sring->req_prod;
+        rmb();
+        for ( i = blktap_ube_ring.req_cons; i != rp; i++ )
+        {
+            req_s = RING_GET_REQUEST(BLKIF_RING, &blktap_ube_ring, i);
+
+            DPRINTK("req->be_ring\n");
+            write_req_to_be_ring(req_s);
+            kick_be_domain();
+        }
+        
+        blktap_ube_ring.req_cons = i;
+    }
+
+    return 0;
+}
 /* -------[ blktap module setup ]------------------------------------- */
 
 static struct miscdevice blktap_miscdev = {